Apple Developer Connection Student Program

home *** CD-ROM | disk | FTP | other *** search

/ Apple Developer Connection Student Program / ADC Tools Sampler CD Disk 3 1999.iso / Metrowerks CodeWarrior / Java Support / Java_Source / IFC_112 / netscape / application / HTMLParser.java < prev next >

Wrap

Text File | 1999-05-28 | 22.7 KB | 575 lines | [TEXT/CWIE]

// HTMLParser.java
// By Ned Etcode
// Copyright 1995, 1996, 1997 Netscape Communications Corp. All rights reserved.

package netscape.application;

import netscape.util.*;
import java.io.InputStream;
import java.io.IOException;
import java.io.FilterInputStream;

/** A generic HTML parser. This class provides the HTML
  * parsing functionality without defining how to store HTML.
  * The user provides some information, telling the parser
  * which class should be used for which marker.
  * The parser creates instances of these classes.
  * <p>
  * Tokens are pulled from an <b>HTMLTokenGenerator</b> built on the
  * input stream handed to the constructor; the mapping from marker to
  * element class and the container/terminator rules come from an
  * <b>HTMLParsingRules</b> instance.
  * @note 1.0 changes
  * @private
  */
public class HTMLParser extends FilterInputStream {
    /*
     * Special char to unicode.
     * Flat table of pairs: the entity name (without '&' and ';') sits at
     * every even index and its one-character replacement at the following
     * odd index. Lookup is a case-sensitive linear scan.
     */
    private static final String specialChars[] = {
        "lt", "<",
        "gt", ">",
        "amp", "&",
        "quot", "\"" ,
        "nbsp","\u00a0",
        "iexcl","\u00a1",
        "cent","\u00a2",
        "pound","\u00a3",
        "curren","\u00a4",
        "yen","\u00a5",
        "brvbar","\u00a6",
        "sect","\u00a7",
        "uml","\u00a8",
        "copy","\u00a9",
        "ordf","\u00aa",
        "laquo","\u00ab",
        "not","\u00ac",
        "shy","\u00ad",
        "reg","\u00ae",
        "macr","\u00af",
        "deg","\u00b0",
        "plusmn","\u00b1",
        "sup2","\u00b2",
        "sup3","\u00b3",
        "acute","\u00b4",
        "micro","\u00b5",
        "para","\u00b6",
        "middot","\u00b7",
        "cedil","\u00b8",
        "sup1","\u00b9",
        "ordm","\u00ba",
        "raquo","\u00bb",
        "frac14","\u00bc",
        "frac12","\u00bd",
        "frac34","\u00be",
        "iquest","\u00bf",
        "Agrave","\u00c0",
        "Aacute","\u00c1",
        "Acirc","\u00c2",
        "Atilde","\u00c3",
        "Auml","\u00c4",
        "Aring","\u00c5",
        "AElig","\u00c6",
        "Ccedil","\u00c7",
        "Egrave","\u00c8",
        "Eacute","\u00c9",
        "Ecirc","\u00ca",
        "Euml","\u00cb",
        "Igrave","\u00cc",
        "Iacute","\u00cd",
        "Icirc","\u00ce",
        "Iuml","\u00cf",
        "ETH","\u00d0",
        "Ntilde","\u00d1",
        "Ograve","\u00d2",
        "Oacute","\u00d3",
        "Ocirc","\u00d4",
        "Otilde","\u00d5",
        "Ouml","\u00d6",
        "times","\u00d7",
        "Oslash","\u00d8",
        "Ugrave","\u00d9",
        "Uacute","\u00da",
        "Ucirc","\u00db",
        "Uuml","\u00dc",
        "Yacute","\u00dd",
        "THORN","\u00de",
        "szlig","\u00df",
        "agrave","\u00e0",
        "aacute","\u00e1",
        "acirc","\u00e2",
        "atilde","\u00e3",
        "auml","\u00e4",
        "aring","\u00e5",
        "aelig","\u00e6",
        "ccedil","\u00e7",
        "egrave","\u00e8",
        "eacute","\u00e9",
        "ecirc","\u00ea",
        "euml","\u00eb",
        "igrave","\u00ec",
        "iacute","\u00ed",
        "icirc","\u00ee",
        "iuml","\u00ef",
        "eth","\u00f0",
        "ntilde","\u00f1",
        "ograve","\u00f2",
        "oacute","\u00f3",
        "ocirc","\u00f4",
        "otilde","\u00f5",
        "ouml","\u00f6",
        "divide","\u00f7",
        "oslash","\u00f8",
        "ugrave","\u00f9",
        "uacute","\u00fa",
        "ucirc","\u00fb",
        "uuml","\u00fc",
        "yacute","\u00fd",
        "thorn","\u00fe",
        "yuml","\u00ff",
        "ensp"," ",
        "emsp"," ",
        "endash","-",
        "emdash","-",
        /*  "zwnj","\u200c",
            "zwj", "\u200d",
            "lrm", "\u200e",
            "rlm", "\u200f",*/
    };

    /** Source of tokens; built on the stream passed to the constructor. */
    private HTMLTokenGenerator tokenGenerator;
    /** Marker-to-class mapping and per-marker parsing rules. */
    private HTMLParsingRules rules;
    /* The three fields below are not referenced anywhere in this class. */
    private Class defaultContainerClass = null;
    private Class defaultMarkerClass = null;
    private FoundationApplet applet;
    /** When true, reportSyntaxError() throws instead of ignoring. */
    private boolean throwsException = false;

    /** Constructor. Parses <b>in</b> using a freshly created
      * HTMLParsingRules instance.
      */
    public HTMLParser(InputStream in) {
        this(in,new HTMLParsingRules());
    }

    /** Constructor. Parses <b>in</b> using the supplied <b>rules</b>.
      * The stream is handed both to the FilterInputStream superclass and
      * to the token generator.
      */
    public HTMLParser(InputStream in,HTMLParsingRules rules) {
        super(in);
        this.rules = rules;
        tokenGenerator = new HTMLTokenGenerator(in);
    }

    /** Set whether the parser should raise when some bad HTML is parsed.
      * If flag is false, bad statements will just be ignored.
      * The default is false.
      */
    public void setThrowsExceptionOnHTMLError(boolean flag) {
        throwsException = flag;
    }

    /** Return whether the parser throws an exception when some bad HTML
      * is parsed.
      */
    public boolean throwsExceptionOnHTMLError() {
        return throwsException;
    }

    /**
      * Parse the next HTML component.
      * Keeps consuming tokens until one of them produces an element.
      * @return the next element, or null once the token generator has no
      *         more tokens.
      */
    public HTMLElement nextHTMLElement()
        throws IOException,HTMLParsingException,
               java.lang.InstantiationException,java.lang.IllegalAccessException {
        HTMLElement result;

        while( tokenGenerator.hasMoreTokens()) {
            result = parseNextHTMLElement(true,true,null);
            if( result != null )
                return result;
        }
        return null;
    }

    /**
      * Utility to convert a String containing attributes to a Hashtable.
      * Keys will be converted to upper case. Surrounding single or double
      * quotes are removed from keys and values. An attribute without a
      * value is stored with the empty string as its value.
      * @param attributesString the raw attribute text; null yields an
      *        empty Hashtable.
      * @return a Hashtable mapping upper-cased keys to String values.
      * @throws HTMLParsingException if a key cannot be parsed.
      */
    public static Hashtable hashtableForAttributeString(String attributesString)
        throws HTMLParsingException {
        Hashtable result = new Hashtable();
        FastStringBuffer fb = new FastStringBuffer();
        String key,value;
        int i,c,offset;

        if( attributesString == null )
            return result;

        c = attributesString.length();
        i = 0;
        while( i < c ) {
            while( i < c && isSpace(attributesString.charAt(i)) )
                i++;
            if( i == c )
                break;

            fb.truncateToLength( 0 );
            offset = parseKeyOrValue( attributesString, i , fb );
            if( offset == 0 ) {
                throw new HTMLParsingException("Error while parsing attributes " +
                                               attributesString,0);
            }
            key = filterKeyOrValue( fb );
            key = key.toUpperCase();
            i += offset;
            /* An empty quoted token is not a usable key; skip it. */
            if( key.equals(""))
                continue;

            while( i < c && isSpace(attributesString.charAt(i)) )
                i++;

            if( i < c && attributesString.charAt(i) == '=' ) { /* We have a value */
                i++;
                fb.truncateToLength( 0 );
                /* parseKeyOrValue skips any whitespace after '=' and its
                 * return value accounts for it, so i ends up right after
                 * the value.
                 */
                offset = parseKeyOrValue( attributesString, i, fb );
                value = filterKeyOrValue( fb );
                i += offset;
                result.put(key,value);
            } else {  /* Attribute without a value */
                result.put(key,"");
            }
        }
        return result;
    }

    /** Called on syntax error. Throws an exception if throwing on HTML
      * errors is enabled (see setThrowsExceptionOnHTMLError()). Otherwise
      * does nothing.
      * @param description text of the error; the exception also carries
      *        the token generator's line for the last token.
      */
    public void reportSyntaxError(String description) throws HTMLParsingException {
        if( throwsException )
            throw new HTMLParsingException( description ,
                                            tokenGenerator.lineForLastToken());
    }

    /** Convenience to avoid breaking constructor.
      * Registers the name of <b>aClass</b> in the rules as the class to
      * use for <b>aMarker</b>.
      */
    public void setClassForMarker(Class aClass,String aMarker) {
        rules.setClassNameForMarker(aClass.getName(),aMarker);
    }

    /** Translate the body of a character reference (the text between '&'
      * and ';') into a character.
      * <p>
      * A leading '#' denotes a numeric reference, decimal ("#65") or
      * hexadecimal ("#x41" / "#X41"). Anything else is looked up, case
      * sensitively, in the specialChars table.
      * @return the character, or 0 if the reference is unknown, malformed
      *         or does not fit in a char.
      */
    private final char unicodeCharForBytes( String bytes ) {
        String s = bytes;
        int i,c;

        if( s.length() > 0 && s.charAt(0) == '#' ) {
            int code;

            try {
                if( s.length() > 1 && (s.charAt(1) == 'x' || s.charAt(1) == 'X') )
                    code = Integer.parseInt(s.substring(2),16);
                else
                    code = Integer.parseInt(s.substring(1));
            } catch(NumberFormatException e) {
                /* Malformed reference in the document: treat it like an
                 * unknown entity rather than aborting the whole parse with
                 * an unchecked exception.
                 */
                return 0;
            }
            /* A char cannot hold anything outside 0..0xFFFF; casting would
             * silently produce an unrelated character.
             */
            if( code < 0 || code > 0xFFFF )
                return 0;
            return (char) code;
        }

        for(i = 0 , c = specialChars.length ; i < c ; i += 2 ) {
            if( specialChars[i].equals( s ))
                return specialChars[i+1].charAt(0);
        }
        return 0;
    }

    /** Decode the character reference that starts with the '&' located at
      * <b>startIndex</b> in <b>s</b> and append the decoded character to
      * <b>result</b>.
      * <p>
      * The reference name runs up to the next ';', space, newline, tab or
      * the end of the string. Unknown references, and references decoding
      * to character 0 or 8, append nothing but are still consumed.
      * @return the number of characters of <b>s</b> consumed, counting the
      *         '&' and the terminating ';' when present, or 0 when no
      *         name follows the '&'.
      */
    private final int convertSpecialCharacter(String s,int startIndex,
                                              FastStringBuffer result ){
        int length = s.length();
        char theChar;

        if( (startIndex+1) < length ) {
            int start = startIndex + 1;
            int end = start;
            char ch;

            ch = s.charAt(end);
            while( end < length && ch != ';' && ch != ' ' && ch != '\n' &&
                   ch != '\t' ) {
                end++;
                if( end < length )
                    ch = s.charAt(end);
                else
                    ch = 0;
            }

            if( end > start ) {
                String subStr = s.substring(start,end);

                theChar = unicodeCharForBytes( subStr );
                if( theChar != 0 && theChar != 8 )
                    result.append( theChar );
                if( end < length && s.charAt(end) == ';')
                    return subStr.length() + 2; /* + 1 for the starting & and the ; */
                else
                    return subStr.length() + 1;
            }
        }
        return 0;
    }

    /** Clean up the text of a string token.
      * <ul>
      * <li>Character references introduced by '&' are decoded; a bare '&'
      *     that is not followed by a reference name is kept as is.
      * <li>Characters outside the range ' '..'~' other than newline and
      *     tab are removed.
      * <li>When <b>filterSpaces</b> is true, runs of spaces, tabs and
      *     newlines collapse to a single space. Leading tabs and newlines
      *     are removed, and so are leading spaces unless
      *     <b>allowSpaceForFirstChar</b> is true.
      * </ul>
      * @return the filtered string, or null if nothing is left.
      */
    private final String filterHTMLString(String s,boolean filterSpaces,
                                          boolean allowSpaceForFirstChar) {
        FastStringBuffer sb = new FastStringBuffer();
        int i,c,delta;
        char ch;
        boolean previousCharWasSpace = false;
        boolean nonSpaceCharFound = false;

        for(i=0,c=s.length() ; i < c ; i++) {
            ch = s.charAt(i);
            if(filterSpaces && (ch == ' ' || ch == '\t' || ch == '\n') ) {
                /* Leading white space: tabs and newlines always go, plain
                 * spaces go unless the caller allows one.
                 */
                if( !nonSpaceCharFound &&
                    (!allowSpaceForFirstChar || ch != ' ') )
                    continue;
                if( previousCharWasSpace )
                    continue;
                previousCharWasSpace = true;
                sb.append(' ');
                continue;
            } else if( ch == '&' ) {
                delta = convertSpecialCharacter(s,i,sb );
                if( delta > 0 )
                    i += (delta - 1); /* -1 since i++ will happen before the next iteration */
                else
                    sb.append('&');   /* No reference name follows: keep the ampersand */
                previousCharWasSpace = false;
                nonSpaceCharFound = true;
                continue;
            } else if(ch != '\n' && ch != '\t' && (ch < ' ' || ch > '~') ) {
                /* Should filter these characters */
                continue;
            }
            previousCharWasSpace = false;
            nonSpaceCharFound = true;
            sb.append( ch );
        }

        if( sb.length() > 0 )
            return sb.toString();
        else
            return null;
    }

    /** Return the class registered in the rules for <b>aMarker</b>.
      * The class is loaded through the current Application when there is
      * one, through Class.forName() otherwise.
      * @return the class, or null if no class name is registered or the
      *         class cannot be found (the failure is printed on
      *         System.err).
      */
    private Class classForMarker(String aMarker) {
        String className = rules.classNameForMarker(aMarker);

        if( className == null )
            return null;

        Class c;
        try {
            Application app = Application.application();
            if(app != null)
                c = app.classForName(className);
            else
                c = Class.forName(className);
        } catch(ClassNotFoundException e) {
            System.err.println("" + e);
            c = null;
        }
        return c;
    }

    /** Consume the next token and build the element it introduces.
      * For a container marker this recursively parses the children until
      * the matching end marker, one of the marker's terminators, or the
      * end of input is reached.
      * @param doFilterStrings whether string tokens go through white space
      *        filtering; children are filtered only if this is true and
      *        the rules of their parent marker ask for it.
      * @param allowSpaceAsFirstChar passed to filterHTMLString() as
      *        allowSpaceForFirstChar.
      * @param pMarker marker of the enclosing container, used in error
      *        messages only; may be null.
      * @return the element, or null when the token produced none (no class
      *         registered for it, string filtered down to nothing, stray
      *         end marker, unknown token).
      */
    private final HTMLElement parseNextHTMLElement(boolean doFilterStrings,
                                                   boolean allowSpaceAsFirstChar,
                                                   String pMarker)
        throws IOException,HTMLParsingException,
               java.lang.InstantiationException,java.lang.IllegalAccessException {
        int token;
        HTMLElement result = null;
        Class c;
        String marker;
        Hashtable markerRules;

        token = tokenGenerator.nextToken();
        switch( token ) {
        case HTMLTokenGenerator.STRING_TOKEN:
            if((c = classForMarker(HTMLParsingRules.STRING_MARKER_KEY)) != null ) {
                String s = tokenGenerator.stringForLastToken();
                s = filterHTMLString(s,doFilterStrings,allowSpaceAsFirstChar);
                if( s != null ) { /* Filter might remove string with only spaces */
                    result = (HTMLElement) c.newInstance();
                    result.setMarker(HTMLParsingRules.STRING_MARKER_KEY);
                    result.setString( s );
                    return result;
                }
            }
            break;

        case HTMLTokenGenerator.MARKER_BEGIN_TOKEN:
            marker = tokenGenerator.stringForLastToken();
            markerRules = rules.rulesForMarker(marker);
            if( (c = classForMarker(marker)) != null) {
                if( rules.isContainer(markerRules)) {
                    HTMLElement nextChild;
                    Vector beginTerminators = null;
                    Vector endTerminators = null;
                    Object children[],tmp[];
                    int childrenCount;
                    boolean endMarkerFound = false;
                    boolean notFirstChild = false;

                    result = (HTMLElement) c.newInstance();
                    result.setMarker( marker );
                    result.setAttributes( tokenGenerator.attributesForLastToken());
                    /* Children are collected in an array that doubles
                     * whenever it fills up.
                     */
                    children = new Object[2];
                    childrenCount = 0;

                    if( markerRules != null ) {
                        beginTerminators = (Vector) markerRules.get(
                            HTMLParsingRules.BEGIN_TERMINATION_MARKERS_KEY);
                        endTerminators = (Vector) markerRules.get(
                            HTMLParsingRules.END_TERMINATION_MARKERS_KEY);
                    }

                    while( tokenGenerator.hasMoreTokens() ) {
                        token = tokenGenerator.peekNextToken();
                        if( token == HTMLTokenGenerator.MARKER_END_TOKEN ) {
                            String endMarker = tokenGenerator.stringForLastToken();
                            if(marker.equals(endMarker)) {
                                tokenGenerator.nextToken(); /* Remove the token */
                                endMarkerFound = true;
                                break;
                            } else if( endTerminators != null &&
                                       endTerminators.indexOf(endMarker)!=-1) {
                                /* Token is left in place for the caller. */
                                endMarkerFound=true;
                                break;
                            } else if(classForMarker(endMarker) != null) {
                                /* Unexpected end for a known marker.
                                 * This is an error but we should
                                 * stop parsing the current marker
                                 * to allow the known marker to be
                                 * closed. This strategy avoids having
                                 * very deep trees when some closing
                                 * markers are not in the right scope.
                                 */
                                reportSyntaxError("Unexpected closing " + endMarker +
                                                  " while parsing contents for " +
                                                  marker );
                                endMarkerFound=true;
                                break;
                            }
                        } else if( token == HTMLTokenGenerator.MARKER_BEGIN_TOKEN &&
                                   beginTerminators != null &&
                                   beginTerminators.indexOf(
                                       tokenGenerator.stringForLastToken()) != -1 ) {
                            endMarkerFound = true;
                            break;
                        }

                        /* Should filter strings if the marker requires it or
                         * one of the parent requires it.
                         */
                        if( rules.shouldFilterStringsForChildren(markerRules) &&
                            doFilterStrings==true )
                            nextChild = parseNextHTMLElement(true,notFirstChild,marker);
                        else
                            nextChild = parseNextHTMLElement(false,notFirstChild,marker);
                        notFirstChild = true;

                        if( nextChild == null ) {
                            if( tokenGenerator.hasMoreTokens() == false ) {
                                reportSyntaxError("Unterminated marker " + marker);
                                break;
                            } else
                                continue;
                        } else {
                            children[childrenCount++] = nextChild;
                            if( childrenCount == children.length ) {
                                Object newChildren[] = new Object[children.length * 2];
                                System.arraycopy(children,0,newChildren,0,childrenCount);
                                children = newChildren;
                            }
                        }
                    }

                    /* Hand over an exactly sized array, or null when the
                     * container is empty.
                     */
                    if( childrenCount > 0 ) {
                        tmp = new Object[childrenCount];
                        System.arraycopy(children,0,tmp,0,childrenCount);
                        result.setChildren( tmp );
                    } else
                        result.setChildren( null );

                    if(! endMarkerFound ) {
                        reportSyntaxError("No end found for marker " + marker);
                    }
                    return result;
                } else {
                    result = (HTMLElement) c.newInstance();
                    result.setMarker( marker );
                    result.setAttributes( tokenGenerator.attributesForLastToken());
                    return result;
                }
            }
            break;

        case HTMLTokenGenerator.COMMENT_TOKEN:
            if((c = classForMarker(HTMLParsingRules.COMMENT_MARKER_KEY)) != null ) {
                String s = tokenGenerator.stringForLastToken();
                result = (HTMLElement) c.newInstance();
                result.setMarker(HTMLParsingRules.COMMENT_MARKER_KEY);
                result.setString( s );
                return result;
            }
            break;

        case HTMLTokenGenerator.MARKER_END_TOKEN:
            marker = tokenGenerator.stringForLastToken();
            c = classForMarker(marker);
            if( c != null && !rules.shouldIgnoreEnd( rules.rulesForMarker( marker ))) {
                reportSyntaxError("Unexpected closing " + marker +
                                  " while parsing contents for marker " + pMarker);
            }
            break;

        default:
            reportSyntaxError("Unexpected statement");
        }
        return null;
    }

    /** Return true if <b>c</b> is a space, a tab or a newline. */
    private static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n';
    }

    /** Read one attribute key or value from <b>source</b>, starting at
      * <b>index</b>, and append it to <b>dest</b>.
      * <p>
      * Leading white space is skipped. A token starting with a single or
      * double quote extends to the matching quote (or the end of the
      * string) and is copied with its quotes; any other token extends to
      * the next white space or '='.
      * @return the number of characters consumed from <b>index</b>,
      *         including the skipped white space, or 0 if only white
      *         space remained.
      */
    private static int parseKeyOrValue(String source,int index,FastStringBuffer dest) {
        int length = source.length();
        int start = index;
        int end;
        char endChar = 0;

        while(start < length && isSpace(source.charAt(start)))
            start++;
        if( start == length )
            return 0;

        end = start;
        if( source.charAt(end) == '\'' || source.charAt(end) == '"' )
            endChar = source.charAt(end);

        /* The first character is always taken, so an opening quote is not
         * mistaken for its own terminator.
         */
        do {
            dest.append(source.charAt(end));
            end++;
        } while(end < length &&
                ((endChar == 0 && !isSpace(source.charAt(end)) &&
                  source.charAt(end) != '=' ) ||
                 (endChar != 0 && source.charAt(end) != endChar)));

        if( endChar != 0 && end < length && source.charAt(end) == endChar ) {
            dest.append(source.charAt(end));
            end++;
        }

        /* Measured from index, not from start: callers advance their own
         * position by this amount, so the skipped white space must be
         * counted or they would land inside the token.
         */
        return end - index;
    }

    /** Remove " or '.
      * If <b>source</b> starts with a quote, the opening quote is dropped,
      * and so is the last character when it is the matching closing quote.
      * An unterminated quoted token therefore keeps all of its text.
      * Tokens that do not start with a quote are returned unchanged.
      */
    private static String filterKeyOrValue(FastStringBuffer source) {
        int c = source.length();
        char first;
        int end;

        if( c == 0 )
            return "";

        first = source.charAt(0);
        if( first != '\'' && first != '"' )
            return source.toString();

        end = c;
        if( c > 1 && source.charAt(c - 1) == first )
            end = c - 1;
        return source.toString().substring(1,end);
    }
}